# Set of climatic variables# =========================clim_var <-readRDS(here("data/ClimaticData/NamesSelectedVariables.rds"))# Climatic data# =============# we scale the past and future climatic data at the location of the populations# with the parameters (mean and variance) of the past climatic data.source(here("scripts/functions/generate_scaled_clim_datasets.R"))clim_dfs <-generate_scaled_clim_datasets(clim_var,clim_ref_adj =FALSE)
Running GF models
Code
snp_sets <-lapply(snp_sets, function(snp_set) {# Warning! Important to sort the SNP names, otherwise# the colors in the species (allele) cumulative plots # do not correspond to the right allelesgeno_sub <- geno %>% dplyr::select(all_of(sort(snp_set$set_snps)))snp_set$gf_mod <-gradientForest(data.frame(clim_dfs$clim_ref[,-1], geno_sub), predictor.vars=clim_var, response.vars=colnames(geno_sub), corr.threshold=0.5, ntree=500, trace=T)return(snp_set)})
Warning in randomForest.default(x = X, y = spec_vec, maxLevel = maxLevel, : The response has five or fewer unique values. Are you sure you want to do regression?
Number of alleles for which the climatic variables have some predictive power:
Code
lapply(snp_sets, function(x){tibble("SNP set"=x$set_name,"Nb of alleles with predictive power"=x$gf_mod$species.pos.rsq)}) %>%bind_rows() %>%kable_mydf()
SNP set
Nb of alleles with predictive power
Candidate SNPs
69
Candidate SNPs (with correction)
45
Control SNPs
47
We generate some plots to evaluate the GF models (stored in the files GFplots_[SNP set code].pdf):
Predictor overall importance plots. They show the mean accuracy importance and the mean importance weighted by SNPs \(\mathcal{R}^2\).
Splits density plots. They show the binned split importance and location on each gradient (spikes), kernel density of splits (blacklines), of observations (red lines) and of splits standardised by observations density (bluelines). Each distribution integrates to predictor importance. These show where important changes in the abundance of multiple alleles are occurring along the gradient; they indicate a composition change rate.
Species (in our case alleles) cumulative plots. They show, for each SNPs, the cumulative importance distributions of splits improvement scaled by \(\mathcal{R}^2\) weighted importance, and standardised by density of observations. These show cumulative change in the presence of individual allele, where changes occur on the gradient, and the alleles changing most on each gradient.
Predictor cumulative plots. They show, for each predictor, the cumulative importance distributions of splits improvement scaled by \(\mathcal{R}^2\) weighted importance, and standardised by density of observations, averaged over all SNPs. These show cumulative change in overall allelic composition, and where changes occur on the gradient.
\(\mathcal{R}^2\)measure of the fit of the random forest model for each SNPs.
The code to generate those plots comes from ‘Example analysis of biodiversity survey data with R package gradientForest’ by C. Roland Pitcher, Nick Ellis and Stephen J. Smith (pdf available here).
Code
# ==============================# Functions to make the GF plots# ==============================# splits density plotmake_split_density_plot <-function(x){plot(x,plot.type="S",imp.vars=names(importance(x)),leg.posn="topright",cex.legend=1,cex.axis=1, cex.lab=1.2,line.ylab=0.9,par.args=list(mgp=c(1.5, 0.5,0),mar=c(3.1,1.5,0.1,1),omi =c(0.1, 0.3, 0.1, 0.1))) }# predictor cumulative plotmake_predictor_cumulative_plot <-function(x){plot(x, plot.type="C",imp.vars=names(importance(x)),show.species=F,common.scale=T,cex.axis=1, cex.lab=1.5,line.ylab=1,par.args=list(mgp=c(1.5, 0.5,0),mar=c(2.5,1,0.1,0.5),omi=c(0, +0.3,0,0)))}# species cumulative plotmake_species_cumulative_plot <-function(x){plot(x,plot.type="C",imp.vars=names(importance(x)), show.overall=F,legend=T,leg.posn="topleft", leg.nspecies=10,cex.lab=1,cex.legend=1, cex.axis=1,line.ylab=1,par.args=list(mgp=c(1.5, 0.5,0),mar=c(2.5,1,0.1,0.5),omi=c(0, +0.3,0,0))) }# R2 measure of the fit of the random forest model for each speciesmake_performance_plot <-function(x, horizontal=FALSE){ old.mar<-par()$marpar(mfrow=c(1,1),mar=old.mar+c(0,0,0,0)) Ylab <-expression(R^2) perf <-importance(x, type="Species") n <-length(perf)if (horizontal)plot(perf, 1:n, las =2, pch=19, axes=F, xlab="", ylab="")elseplot(1:n, perf, las =2, pch=19, axes=F, xlab="", ylab="")axis(labels=names(perf),side=1+horizontal,at=1:n, cex.axis=0.7, padj=0,las=2)axis(side=2-horizontal, cex.axis=1)mtext(Ylab,side=2-horizontal,line=2)title("Overall performance of random forests over loci")abline(h =0, lty =2)box()par(mar=old.mar)}# ===============================================# Generate the GF plots for the Github repository# ===============================================lapply(snp_sets, function(snp_set){pdf(here(paste0("figs/GF/GFplots_",snp_set$set_code,".pdf")), width=12,height=8)# Overall importance plotplot(snp_set$gf_mod, plot.type="Overall.Importance")# splits density plotmake_split_density_plot(x=snp_set$gf_mod)# species cumulative plotmake_species_cumulative_plot(x=snp_set$gf_mod)# predictor cumulative plotmake_predictor_cumulative_plot(x=snp_set$gf_mod)# R2 measure of the fit of the random forest model for each speciesmake_performance_plot(x=snp_set$gf_mod)dev.off()})# =======================================================# Generate the GF plots for the Supplementary Information# =======================================================lapply(snp_sets[c(1,3)], function(snp_set){# Overall importance plotpdf(here(paste0("figs/GF/GFplots_OverallImportance_",snp_set$set_code,"_SI.pdf")), width=8,height=5)plot(snp_set$gf_mod, plot.type="Overall.Importance")dev.off()# Splits density plotpdf(here(paste0("figs/GF/GFplots_SplitDensityPlot_",snp_set$set_code,"_SI.pdf")), width=8,height=5)make_split_density_plot(x=snp_set$gf_mod)dev.off()# Allele cumulative plotpdf(here(paste0("figs/GF/GFplots_AlleleCumulativePlot_",snp_set$set_code,"_SI.pdf")), width=7,height=7)make_species_cumulative_plot(x=snp_set$gf_mod)dev.off()# predictor cumulative plotpdf(here(paste0("figs/GF/GFplots_PredictorCumulativePlot_",snp_set$set_code,"_SI.pdf")), width=7,height=7)make_predictor_cumulative_plot(x=snp_set$gf_mod)dev.off()# R2 measure of the fit of the random forest model for each speciesp <-tibble(snp=names(sort(snp_set$gf_mod$result)),importance=sort(snp_set$gf_mod$result)) %>%ggplot() +geom_point(aes(y=reorder(snp, importance),x=importance)) +ylab("") +xlab(expression(R^2)) +theme_bw() ggsave(p, filename =here(paste0("figs/GF/GFPlots_AlleleImportance_",snp_set$set_code,"_SI.pdf")), width=4, height=10)})
GO predictions
Code
snp_sets <-lapply(snp_sets, function(snp_set){snp_set$go <-lapply(clim_dfs$clim_pred, function(clim_pred){ref_pred <-predict(snp_set$gf_mod) # predictions under current climatesfut_pred <-predict(snp_set$gf_mod, as.data.frame(clim_pred[,clim_var])) # predictions under future climateslapply(1:nrow(ref_pred), function(x, ref_pred, fut_pred){as.numeric(pdist(ref_pred[x,], fut_pred[x,])@dist)}, fut_pred=fut_pred, ref_pred=ref_pred) %>%unlist()})return(snp_set)})
Relationship with Euclidean distance
Code
source(here("scripts/functions/make_eucli_plot.R"))# Calculate the Euclidean climatic distancelist_dist_env <- clim_dfs$clim_pred %>%lapply(function(clim_pred){Delta = clim_dfs$clim_ref %>% dplyr::select(any_of(clim_var)) - clim_pred %>% dplyr::select(any_of(clim_var)) dist_env =sqrt( rowSums(Delta^2) )})# Main gene pools (for the figures)gps <-readRDS(here("data/GenomicData/MainGenePoolPopulations.rds")) %>%arrange(pop)
We look at the correlation across the different genomic offset predictions at the location of the populations, i.e. those based on all SNPs and those based on sets of candidates or control SNPs.
# Function to make the genomic offset mapssource(here("scripts/functions/make_go_map.R"))# Population coordinatespop_coord <-readRDS(here(paste0("data/ClimaticData/MaritimePinePops/ClimatePopulationLocationPointEstimates_ReferencePeriods_noADJ.rds")))[[1]]$ref_means %>% dplyr::select(pop,longitude,latitude)# Find minimum and maximum values of genomic offset for the mapsgo_limits <-lapply(snp_sets, function(x) {lapply(names(list_dist_env), function(gcm){x$go[[gcm]]}) %>%unlist()}) %>%unlist() %>%range()# The minimum GO value is very very small, almost zero, so we fix it to zero.go_limits[[1]] <-0# Generate the maps for each set of SNPs and each GCMlapply(snp_sets, function(x) {go_maps <-lapply(names(list_dist_env), function(gcm){make_go_map(dfcoord=pop_coord,snp_set = x,gcm=gcm,ggtitle=gcm,go_limits = go_limits,point_size =3)})legend_maps <-get_legend(go_maps[[1]])go_maps <-lapply(go_maps, function(y) y +theme(legend.position ="none"))go_maps$legend_maps <- legend_mapsgo_maps <-plot_grid(plotlist=go_maps)# =====================================================# We save the figures for the Supplementary Information# =====================================================if(x$set_code=="cand"){ ggsave(here("figs/GF/GOMaps_PopLocations_CandidateSNPs_SI.pdf"), go_maps, width=10,height=6, device="pdf") } elseif(x$set_code=="control"){ggsave(here("figs/GF/GOMaps_PopLocations_ControlSNPs_SI.pdf"), go_maps, width=10,height=6, device="pdf") }# =========# Add title# =========title <-ggdraw() +draw_label( x$set_name,fontface ='bold',x =0,hjust =0 ) +theme(plot.margin =margin(0, 0, 0, 7))# merge title and plotsplot_grid( title, go_maps,ncol =1,rel_heights =c(0.1, 1))})
For each GCM, we attribute the value 1 to the top five populations with the highest genomic offset and we attribute the value 0 to the other populations. We then count the number of 1 for each population, which gives the table and map below:
Code
source(here("scripts/functions/make_high_go_pop_maps.R"))high_go_pops <-make_high_go_pop_maps(pop_coord=pop_coord,list_go = snp_sets$cand$go,ggtitle="GF",nb_id_pop =5) # number of selected populationssaveRDS(high_go_pops, file =here("outputs/GF/high_go_pops.rds"))high_go_pops[[1]] %>% kable_mydf
pop
longitude
latitude
GFDL-ESM4
IPSL-CM6A-LR
MPI-ESM1-2-HR
MRI-ESM2-0
UKESM1-0-LL
sum_go
PUE
-6.63
43.55
0
0
0
0
0
0
LEI
-8.96
39.78
0
0
0
0
0
0
PIA
9.46
42.02
0
0
0
0
0
0
QUA
-0.36
38.97
0
0
0
0
0
0
OLB
-0.62
40.17
0
0
0
0
0
0
LAM
-6.22
43.56
0
0
0
0
0
0
COC
-4.50
41.25
0
0
0
0
0
0
BON
-1.66
39.99
0
0
0
0
0
0
TAM
-5.02
33.60
0
0
0
0
0
0
CUE
-4.48
41.37
0
0
0
0
0
0
CAR
-4.28
41.17
0
0
0
0
0
0
MAD
-5.23
35.18
0
0
0
0
0
0
SAL
-3.06
41.83
0
0
0
0
0
0
CAS
-6.98
43.50
0
0
0
0
0
0
CAD
-6.42
43.54
0
0
0
0
0
0
BAY
-2.88
41.52
0
0
0
0
0
0
PLE
-2.34
47.78
0
0
0
0
0
0
HOU
-1.15
45.18
0
0
0
0
0
0
SIE
-6.49
43.53
0
0
0
0
0
0
STJ
-2.03
46.76
0
0
0
0
0
0
ORI
-2.35
37.53
0
0
0
0
0
0
PIE
9.04
41.97
0
0
0
0
0
0
SAC
-8.36
42.12
0
0
0
0
0
0
VER
-1.09
45.55
0
0
0
0
0
0
ARN
-5.12
40.19
0
0
0
0
0
0
CEN
-4.49
40.28
0
0
1
0
0
1
VAL
-4.31
40.52
0
0
1
0
0
1
MIM
-1.30
44.13
0
1
0
0
0
1
OLO
-1.83
46.57
0
0
0
1
1
2
PET
-1.30
44.06
1
1
0
0
0
2
COM
-3.95
36.83
1
1
0
1
1
4
SEG
-8.45
42.82
1
0
1
1
1
4
ALT
-6.49
43.28
1
1
1
1
1
5
ARM
-6.46
43.30
1
1
1
1
1
5
Code
high_go_pops[[2]]
Validation - NFI plots
Code
# Load the climatic data of the NFI plots.nfi_clim <-readRDS(here("data/ClimaticData/NFIplots/NFIclimate.rds"))# Keep only the climatic variables of interest and scale the climatic datanfi_dfs <-generate_scaled_clim_datasets(clim_var, clim_ref = nfi_clim$clim_ref, clim_pred = nfi_clim$clim_survey)# calculate the genomic offset for the NFI plotssnp_sets <-lapply(snp_sets, function(snp_set){ref_pred <-predict(snp_set$gf_mod, as.data.frame(nfi_dfs$clim_ref[,clim_var])) # predictions under reference-period climatesfut_pred <-predict(snp_set$gf_mod, as.data.frame(nfi_dfs$clim_pred[,clim_var])) # predictions under climates during survey periodsnp_set$go_nfi <-lapply(1:nrow(ref_pred), function(x, ref_pred, fut_pred){as.numeric(pdist(ref_pred[x,], fut_pred[x,])@dist)}, fut_pred=fut_pred, ref_pred=ref_pred) %>%unlist()return(snp_set)})# checking missing data# lapply(snp_sets, function(x) sum(is.na(x$go_nfi)))# Find minimum and maximum values of genomic offset for the mapsgo_limits <-lapply(snp_sets, function(snp_set) snp_set$go_nfi) %>%unlist() %>%range()# The minimum GO value is very very small, almost zero, so we fix it to zero.go_limits[[1]] <-0# map genomic offset predictions in the NFI plots lapply(snp_sets, function(x) { p <-make_go_map(dfcoord=readRDS(here("data/ClimaticData/NFIplots/NFIclimate.rds"))[[1]] %>% dplyr::select(contains("ude")), snp_set = x,type="NFI",point_size =0.5,go_limits = go_limits,legend_position =c(0.85,0.15),legend_box_background ="gray80",y_limits =c(35, 51))# Figure for the SI# =================if(x$set_code=="cand"){ p_SI <- p +theme(plot.title =element_blank())ggsave(here("figs/GF/NFI_GOmap_CandidateSNPs_SI.pdf"), p_SI, width=8,height=8, device="pdf") } elseif(x$set_code=="control"){ p_SI <- p +theme(plot.title =element_blank())ggsave(here("figs/GF/NFI_GOmap_ControlSNPs_SI.pdf"), p_SI, width=8,height=8, device="pdf") }# Show maps in the Quarto document# ================================p })
We look at the correlation across the different genomic offset predictions in the NFI plots, i.e. those based on all SNPs and those based on sets of candidates or control SNPs.
cg_clim <-readRDS(here("data/ClimaticData/CommonGardens/ClimateCG.rds")) %>% dplyr::select(cg,any_of(clim_var))cg_coord <-readRDS(here("data/ClimaticData/CommonGardens/ClimateCG.rds")) %>% dplyr::select(cg,contains("ude"))cg_names <-unique(cg_coord$cg)# Generate scaled climatic datasets with climatic data at the location of the populations and at the location of the common gardenscg_dfs <-generate_scaled_clim_datasets(clim_var, clim_pred = cg_clim)# Predict genomic offset of each population when transplanted in the climate of the common gardenssnp_sets <-lapply(snp_sets, function(snp_set){ref_pred <-predict(snp_set$gf_mod, as.data.frame(cg_dfs$clim_ref[,clim_var])) # predictions under reference-period climatesfut_pred <-predict(snp_set$gf_mod, as.data.frame(cg_dfs$clim_pred[,clim_var])) # predictions under climates during survey periodsnp_set$go_cg <-lapply(1:nrow(ref_pred), function(x, ref_pred, fut_pred){as.numeric(pdist(ref_pred[x,], fut_pred)@dist)}, fut_pred=fut_pred, ref_pred=ref_pred) %>%setNames(cg_dfs$clim_ref[["pop"]]) %>%as.data.frame() %>%t() %>%as.data.frame() %>%set_colnames(cg_dfs$clim_pred[["cg"]]) %>%rownames_to_column(var="pop") %>%as_tibble()return(snp_set)})# Map genomic offset predictions at the locations of the populationsgo_maps_cg <-lapply(cg_names, function(cg_name){# Find minimum and maximum values of genomic offset for the mapsgo_limits <-lapply(snp_sets, function(snp_set) snp_set$go_cg[[cg_name]]) %>%unlist() %>%range()go_limits[[1]] <-0p <-lapply(snp_sets, function(x) make_go_map(dfcoord=pop_coord, snp_set = x,point_size =3,type="CG",go_limits = go_limits,cg_name=cg_name,cg_coord=cg_coord))plot_grid(p[[1]],p[[2]],p[[3]],nrow=1)}) %>%setNames(cg_names)pdf(here("figs/GF/GOmaps_CGs.pdf"), width=15,height=6)lapply(go_maps_cg, function(x) x)dev.off()# show mapslapply(go_maps_cg, function(x) x)
We look at the correlation across the different genomic offset predictions in the common gardens, i.e. those based on all SNPs and those based on sets of candidates or control SNPs.
Fitzpatrick, Matthew C, Vikram E Chhatre, Raju Y Soolanayakanahally, and Stephen R Keller. 2021. “Experimental Support for Genomic Prediction of Climate Maladaptation Using the Machine Learning Approach Gradient Forests.”Molecular Ecology Resources.